{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e74bdf3a-1947-447c-a8f5-e520ae1f480f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import statsmodels.api as sm\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import pointbiserialr\n",
    "from statsmodels.formula.api import ols\n",
    "from statsmodels.formula.api import mixedlm\n",
    "from pandas.api.types import CategoricalDtype\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "db717f85-9781-4e88-ba63-1e6ea011fec6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the response data\n",
    "df = pd.read_csv('data.csv')\n",
    "\n",
    "# Drop rows whose manip_type is 'no' (presumably the unmanipulated images - TODO confirm)\n",
    "df2 = df[df['manip_type'] != 'no'].copy()\n",
    "custom_order = ['erasing', 'copy-paste', 'splicing', 'photoshop-touchup']\n",
    "\n",
    "# Set the column as a categorical type with the specific order\n",
    "cat_type = CategoricalDtype(categories=custom_order, ordered=True)\n",
    "df2['manip_type'] = df2['manip_type'].astype(cat_type)\n",
    "\n",
    "# Sort the DataFrame by the custom order\n",
    "df_sorted = df2.sort_values(by='manip_type')\n",
    "\n",
    "# The 'no' rows were already removed when df2 was built, so filtering on\n",
    "# manip_type != 'no' a second time is a no-op; a plain copy is sufficient.\n",
    "df3 = df_sorted.copy()\n",
    "\n",
    "# One frame per experimental condition\n",
    "ctrl = df3.loc[df3.group_name == 'control'].copy()\n",
    "tr1 = df3.loc[df3.group_name == 'treatment1'].copy()\n",
    "tr2 = df3.loc[df3.group_name == 'treatment2'].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71493f06-291d-47fd-8079-7ce3689772ab",
   "metadata": {},
   "source": [
    "# Finding 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7a6eca4-6218-47b8-a024-d7777044e2c6",
   "metadata": {},
   "source": [
    "## Table 1 and Table 2 (Appendix: A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fbf9dbd9-b19e-4b12-8582-76b40bd62359",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Descriptive Statistics by Group:\n",
      "     Group      Mean  Std Dev  Frequency\n",
      "   control 18.021505 3.349168         93\n",
      "treatment1 22.095745 2.552962         94\n",
      "treatment2 22.259259 2.300966         81\n",
      "\n",
      "Analysis of Variance:\n",
      "        Source          SS    df         MS        F Prof > F\n",
      "Between Groups 1047.005874   2.0 523.502937  67.2899      0.0\n",
      " Within Groups 2061.650843 265.0   7.779815                  \n",
      "         Total 3108.656716 267.0  11.642909                  \n"
     ]
    }
   ],
   "source": [
    "# Flag correct classifications. Both the raw label 'correct' and an already\n",
    "# binarised 1 count as correct, so this cell still gives the right answer if it\n",
    "# is re-run after df['classification'] has been converted to 0/1 further down.\n",
    "df['is_correct'] = df['classification'].isin(['correct', 1]).astype(int)\n",
    "\n",
    "# Number of correct classifications per participant\n",
    "df_grouped = df.groupby(['user_id', 'group_name'])['is_correct'].sum().reset_index()\n",
    "\n",
    "group_stats = df_grouped.groupby('group_name')['is_correct'].agg(['mean', 'std', 'count']).reset_index()\n",
    "group_stats.columns = ['Group', 'Mean', 'Std Dev', 'Frequency']\n",
    "\n",
    "# run one-way ANOVA\n",
    "model = ols('is_correct ~ C(group_name)', data=df_grouped).fit()\n",
    "anova_table = sm.stats.anova_lm(model, typ=2)\n",
    "\n",
    "# calculate mean squares and create the ANOVA summary\n",
    "# (row 0 of anova_table is the group factor, row 1 the residual)\n",
    "anova_summary = pd.DataFrame({\n",
    "    'Source': ['Between Groups', 'Within Groups', 'Total'],\n",
    "    'SS': [anova_table['sum_sq'].iloc[0], anova_table['sum_sq'].iloc[1], anova_table['sum_sq'].sum()],\n",
    "    'df': [anova_table['df'].iloc[0], anova_table['df'].iloc[1], anova_table['df'].sum()],\n",
    "})\n",
    "\n",
    "anova_summary['MS'] = anova_summary['SS'] / anova_summary['df']\n",
    "# F and its p-value only apply to the between-groups row\n",
    "anova_summary['F'] = [anova_table['F'].iloc[0], '', '']\n",
    "anova_summary['Prob > F'] = [anova_table['PR(>F)'].iloc[0], '', '']\n",
    "\n",
    "print(\"Descriptive Statistics by Group:\")\n",
    "print(group_stats.to_string(index=False))\n",
    "\n",
    "print(\"\\nAnalysis of Variance:\")\n",
    "print(anova_summary.to_string(index=False))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e04def28-1a31-40e5-aff7-6a2efb1e4699",
   "metadata": {},
   "source": [
    "## Table 3 and Table 4 were produced with Stata code (Appendix: A). The datasets used by this code are modified versions of the raw dataset."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12543c51-b359-407b-b9c8-2b510887abde",
   "metadata": {},
   "source": [
    "**// Stata code for regressions in Finding 1 Table 3**\n",
    "\n",
    "**// PLEASE USE THE image_lit.dta DATASET**\n",
    "\n",
    "**// t1vscon is feedback vs control**\n",
    "\n",
    "**// t2vscon is education+feedback vs control**\n",
    "\n",
    "**// CODE**\n",
    "\n",
    "reg correct male age education white socialmedia familiar conserve t1vscon  \n",
    "\n",
    "reg correct male age education white socialmedia familiar conserve t2vscon  \n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ccc62037-1dee-48cf-9543-36c3005e7fe9",
   "metadata": {},
   "source": [
    "**// Stata code for regressions in Finding 1 Table 4**\n",
    "\n",
    "**// PLEASE USE THE image_lit.dta DATASET**\n",
    "\n",
    "**// Recode the first 4 images as image1, image2, image3, image4**\n",
    "\n",
    "**// Combine the treatments into one group coded 1 and compare against control coded 0 for treatmentvscontrol**\n",
    "\n",
    "**// CODE**\n",
    "\n",
    "gen imagefourth = image1 + image2 + image3 + image4\n",
    "\n",
    "reg imagefourth male age education white socialmedia familiar conserve treatmentvscontrol"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d07a02a7-4f87-44b5-87b8-d4b8531b0109",
   "metadata": {},
   "source": [
    "# Finding 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b78a6fbb-38b8-4e8e-9c3a-8f98b9d591b8",
   "metadata": {},
   "source": [
    "## Table 5 - Table 11 (Appendix: A)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0f37744a-97ca-4ec2-826c-4c1c1bdac8f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def anova_bon(df, condition=None):\n",
    "    \"\"\"Print descriptives, a one-way ANOVA and Tukey HSD post hoc tests.\n",
    "\n",
    "    Correctness (1 when 'classification' equals 'correct', else 0) is compared\n",
    "    across the levels of 'manip_type' for the rows in `df`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Rows for one condition; needs 'classification' and 'manip_type'\n",
    "        columns. The frame itself is left unmodified.\n",
    "    condition : str, optional\n",
    "        Label shown in the post hoc heading. If omitted, it is read from\n",
    "        df['group_name'] when that column exists and holds a single value.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        All three tables are printed.\n",
    "    \"\"\"\n",
    "    # Add the 0/1 outcome on a copy so the caller's frame is not mutated\n",
    "    trials = df.assign(correct=(df['classification'] == 'correct').astype(int))\n",
    "\n",
    "    # Group by only manipulation type (not user_id) and compute stats\n",
    "    group_stats = trials.groupby('manip_type', observed=True)['correct'].agg(['mean', 'std', 'count']).reset_index()\n",
    "    group_stats.columns = ['Group', 'Mean', 'Std Dev', 'Frequency']\n",
    "\n",
    "    # Run the one-way ANOVA on the rows that were passed in. Fitting on the\n",
    "    # global `ctrl` here would report the control ANOVA for every condition.\n",
    "    model = ols('correct ~ C(manip_type)', data=trials).fit()\n",
    "    anova_table = sm.stats.anova_lm(model, typ=2)\n",
    "\n",
    "    # Pairwise post hoc comparisons; MultiComparison.tukeyhsd is Tukey's HSD\n",
    "    mc = MultiComparison(trials['correct'], trials['manip_type'])\n",
    "    posthoc = mc.tukeyhsd(alpha=0.05)\n",
    "\n",
    "    # First row of the summary table is the header, the rest are the pairs\n",
    "    posthoc_rows = posthoc.summary().data\n",
    "    posthoc_results = pd.DataFrame(data=posthoc_rows[1:], columns=posthoc_rows[0])\n",
    "    posthoc_results.columns = ['group1', 'group2', 'meandiff', 'p-adj', 'lower', 'upper', 'reject']\n",
    "\n",
    "    # Calculate mean squares and create the ANOVA summary\n",
    "    # (row 0 of anova_table is the manip_type factor, row 1 the residual)\n",
    "    anova_summary = pd.DataFrame({\n",
    "        'Source': ['Between Groups', 'Within Groups', 'Total'],\n",
    "        'SS': [anova_table['sum_sq'].iloc[0], anova_table['sum_sq'].iloc[1], anova_table['sum_sq'].sum()],\n",
    "        'df': [anova_table['df'].iloc[0], anova_table['df'].iloc[1], anova_table['df'].sum()],\n",
    "    })\n",
    "\n",
    "    anova_summary['MS'] = anova_summary['SS'] / anova_summary['df']\n",
    "    # F and its p-value only apply to the between-groups row\n",
    "    anova_summary['F'] = [anova_table['F'].iloc[0], '', '']\n",
    "    anova_summary['Prob > F'] = [anova_table['PR(>F)'].iloc[0], '', '']\n",
    "\n",
    "    # Label the post hoc table with the condition actually analysed\n",
    "    if condition is None and 'group_name' in df.columns:\n",
    "        group_names = df['group_name'].dropna().unique()\n",
    "        if len(group_names) == 1:\n",
    "            condition = group_names[0]\n",
    "    heading = \"\\nPost Hoc Tukey HSD Test Results\"\n",
    "    if condition is not None:\n",
    "        heading += f\" ({condition})\"\n",
    "\n",
    "    # Display results\n",
    "    print(\"Descriptive Statistics by Group:\")\n",
    "    print(group_stats.to_string(index=False))\n",
    "\n",
    "    print(\"\\nAnalysis of Variance:\")\n",
    "    print(anova_summary.to_string(index=False))\n",
    "\n",
    "    print(heading + \":\")\n",
    "    print(posthoc_results)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "750074de-248a-49a6-8b8b-a46a6aa2e64d",
   "metadata": {},
   "source": [
    "### Condition: Control. ANOVA and Post Hoc Tukey HSD Test Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0f7ff8ea-ada2-41d3-a0f7-7d8fae03f818",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Descriptive Statistics by Group:\n",
      "            Group     Mean  Std Dev  Frequency\n",
      "          erasing 0.401070 0.490772        374\n",
      "       copy-paste 0.567204 0.496130        372\n",
      "         splicing 0.620968 0.485800        372\n",
      "photoshop-touchup 0.649596 0.477741        371\n",
      "\n",
      "Analysis of Variance:\n",
      "        Source         SS     df       MS          F Prof > F\n",
      "Between Groups  13.826570    3.0 4.608857  19.379566      0.0\n",
      " Within Groups 353.163356 1485.0 0.237820                    \n",
      "         Total 366.989926 1488.0 0.246633                    \n",
      "\n",
      "Post Hoc Bonferroni Test Results (Control):\n",
      "              group1             group2  meandiff   p-adj   lower   upper  \\\n",
      "0         copy-paste            erasing   -0.1661  0.0000 -0.2580 -0.0743   \n",
      "1         copy-paste  photoshop-touchup    0.0824  0.0979 -0.0096  0.1744   \n",
      "2         copy-paste           splicing    0.0538  0.4356 -0.0382  0.1457   \n",
      "3            erasing  photoshop-touchup    0.2485  0.0000  0.1566  0.3404   \n",
      "4            erasing           splicing    0.2199  0.0000  0.1281  0.3117   \n",
      "5  photoshop-touchup           splicing   -0.0286  0.8544 -0.1207  0.0634   \n",
      "\n",
      "   reject  \n",
      "0    True  \n",
      "1   False  \n",
      "2   False  \n",
      "3    True  \n",
      "4    True  \n",
      "5   False  \n"
     ]
    }
   ],
   "source": [
    "# ANOVA and post hoc tables for the rows with group_name == 'control'\n",
    "anova_bon(ctrl)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecea541e-09f3-48c9-bb0e-91aee162a29a",
   "metadata": {},
   "source": [
    "### Condition: Treatment 1 (Feedback). ANOVA and Post Hoc Tukey HSD Test Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "80517c55-17dd-4014-999f-3ed490875fd1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Descriptive Statistics by Group:\n",
      "            Group     Mean  Std Dev  Frequency\n",
      "          erasing 0.617021 0.486761        376\n",
      "       copy-paste 0.686170 0.464666        376\n",
      "         splicing 0.750000 0.433590        376\n",
      "photoshop-touchup 0.670213 0.470762        376\n",
      "\n",
      "Analysis of Variance:\n",
      "        Source         SS     df       MS          F Prof > F\n",
      "Between Groups  13.826570    3.0 4.608857  19.379566      0.0\n",
      " Within Groups 353.163356 1485.0 0.237820                    \n",
      "         Total 366.989926 1488.0 0.246633                    \n",
      "\n",
      "Post Hoc Bonferroni Test Results (Control):\n",
      "              group1             group2  meandiff   p-adj   lower   upper  \\\n",
      "0         copy-paste            erasing   -0.0691  0.1732 -0.1562  0.0180   \n",
      "1         copy-paste  photoshop-touchup   -0.0160  0.9654 -0.1031  0.0711   \n",
      "2         copy-paste           splicing    0.0638  0.2350 -0.0233  0.1509   \n",
      "3            erasing  photoshop-touchup    0.0532  0.3958 -0.0339  0.1403   \n",
      "4            erasing           splicing    0.1330  0.0005  0.0459  0.2201   \n",
      "5  photoshop-touchup           splicing    0.0798  0.0862 -0.0073  0.1669   \n",
      "\n",
      "   reject  \n",
      "0   False  \n",
      "1   False  \n",
      "2   False  \n",
      "3   False  \n",
      "4    True  \n",
      "5   False  \n"
     ]
    }
   ],
   "source": [
    "# ANOVA and post hoc tables for the rows with group_name == 'treatment1'\n",
    "anova_bon(tr1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30a8baae-921c-4c25-83ba-f0f4fdbca03f",
   "metadata": {},
   "source": [
    "### Condition: Treatment 2 (Feedback+Education). ANOVA and Post Hoc Tukey HSD Test Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0d1f2c6a-c0f3-41bf-b71d-5851dd4d9271",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Descriptive Statistics by Group:\n",
      "            Group     Mean  Std Dev  Frequency\n",
      "          erasing 0.574074 0.495247        324\n",
      "       copy-paste 0.700617 0.458696        324\n",
      "         splicing 0.780864 0.414301        324\n",
      "photoshop-touchup 0.743827 0.437194        324\n",
      "\n",
      "Analysis of Variance:\n",
      "        Source         SS     df       MS          F Prof > F\n",
      "Between Groups  13.826570    3.0 4.608857  19.379566      0.0\n",
      " Within Groups 353.163356 1485.0 0.237820                    \n",
      "         Total 366.989926 1488.0 0.246633                    \n",
      "\n",
      "Post Hoc Bonferroni Test Results (Control):\n",
      "              group1             group2  meandiff   p-adj   lower   upper  \\\n",
      "0         copy-paste            erasing   -0.1265  0.0022 -0.2180 -0.0351   \n",
      "1         copy-paste  photoshop-touchup    0.0432  0.6169 -0.0482  0.1346   \n",
      "2         copy-paste           splicing    0.0802  0.1086 -0.0112  0.1717   \n",
      "3            erasing  photoshop-touchup    0.1698  0.0000  0.0783  0.2612   \n",
      "4            erasing           splicing    0.2068  0.0000  0.1154  0.2982   \n",
      "5  photoshop-touchup           splicing    0.0370  0.7247 -0.0544  0.1285   \n",
      "\n",
      "   reject  \n",
      "0    True  \n",
      "1   False  \n",
      "2   False  \n",
      "3    True  \n",
      "4    True  \n",
      "5   False  \n"
     ]
    }
   ],
   "source": [
    "# ANOVA and post hoc tables for the rows with group_name == 'treatment2'\n",
    "anova_bon(tr2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f63ba5a-1f95-4d4a-8bed-8c6b9af3aa36",
   "metadata": {},
   "source": [
    "# Finding 3"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91fc74c3-2cff-45b6-94e8-d3387ee5cfee",
   "metadata": {},
   "source": [
    "## Table 12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "247b71e4-d7d5-4249-a929-3d96c8f2a376",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Optimization terminated successfully.\n",
      "         Current function value: 0.640719\n",
      "         Iterations 5\n",
      "                           Logit Regression Results                           \n",
      "==============================================================================\n",
      "Dep. Variable:         classification   No. Observations:                 8576\n",
      "Model:                          Logit   Df Residuals:                     8574\n",
      "Method:                           MLE   Df Model:                            1\n",
      "Date:                Mon, 17 Mar 2025   Pseudo R-squ.:                 0.01240\n",
      "Time:                        15:39:42   Log-Likelihood:                -5494.8\n",
      "converged:                       True   LL-Null:                       -5563.8\n",
      "Covariance Type:            nonrobust   LLR p-value:                 7.269e-32\n",
      "==============================================================================\n",
      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
      "------------------------------------------------------------------------------\n",
      "const          0.9453      0.039     24.357      0.000       0.869       1.021\n",
      "time_spent    -0.0337      0.003    -10.589      0.000      -0.040      -0.027\n",
      "==============================================================================\n"
     ]
    }
   ],
   "source": [
    "# Convert classification to binary (1 = correct, 0 = anything else).\n",
    "# Values that are already 1 stay 1, so re-running this cell does not turn the\n",
    "# whole column into zeros (which would also make the Logit fit fail).\n",
    "df['classification'] = df['classification'].isin(['correct', 1]).astype(int)\n",
    "\n",
    "# Define independent variables\n",
    "independent_vars = ['time_spent']\n",
    "\n",
    "# Add constant to the independent variables\n",
    "X = sm.add_constant(df[independent_vars])\n",
    "y = df['classification']\n",
    "\n",
    "# Fit the logistic regression model\n",
    "logit_model = sm.Logit(y, X)\n",
    "result = logit_model.fit()\n",
    "\n",
    "# Print the summary of the regression\n",
    "print(result.summary())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "36035c86-1539-4008-b0c1-b605e72f2c76",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Point-Biserial Correlation: -0.1224655630613337, p-value: 5.1204597361634625e-30\n"
     ]
    }
   ],
   "source": [
    "# Correlate time_spent with the 0/1 outcome; relies on df['classification']\n",
    "# having been converted to 0/1 by the Logit cell above.\n",
    "pb_result = pointbiserialr(df['time_spent'], df['classification'])\n",
    "correlation, p_value = pb_result\n",
    "\n",
    "print(f\"Point-Biserial Correlation: {correlation}, p-value: {p_value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7826496-1352-45ba-91d9-8dc0e7cbfe12",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
